# Load libraries
library(RCurl)
library(tidyverse)
library(magrittr)
library(plotly)
library(imputeTS)
library(lubridate)
library(timetk)
library(ggcorrplot)
# Load data (relative file path) and view contents
df <- read.csv("../data/energy_dataset.csv")
head(df)
dim(df) # Number of rows and columns
## [1] 35064 29
summary(df) # Distribution and missingness for each variable
## time generation.biomass generation.fossil.brown.coal.lignite
## Length:35064 Min. : 0.0 Min. : 0.0
## Class :character 1st Qu.:333.0 1st Qu.: 0.0
## Mode :character Median :367.0 Median :509.0
## Mean :383.5 Mean :448.1
## 3rd Qu.:433.0 3rd Qu.:757.0
## Max. :592.0 Max. :999.0
## NA's :19 NA's :18
## generation.fossil.coal.derived.gas generation.fossil.gas
## Min. :0 Min. : 0
## 1st Qu.:0 1st Qu.: 4126
## Median :0 Median : 4969
## Mean :0 Mean : 5623
## 3rd Qu.:0 3rd Qu.: 6429
## Max. :0 Max. :20034
## NA's :18 NA's :18
## generation.fossil.hard.coal generation.fossil.oil generation.fossil.oil.shale
## Min. : 0 Min. : 0.0 Min. :0
## 1st Qu.:2527 1st Qu.:263.0 1st Qu.:0
## Median :4474 Median :300.0 Median :0
## Mean :4256 Mean :298.3 Mean :0
## 3rd Qu.:5839 3rd Qu.:330.0 3rd Qu.:0
## Max. :8359 Max. :449.0 Max. :0
## NA's :18 NA's :19 NA's :18
## generation.fossil.peat generation.geothermal
## Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0
## Median :0 Median :0
## Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0
## NA's :18 NA's :18
## generation.hydro.pumped.storage.aggregated
## Mode:logical
## NA's:35064
##
##
##
##
##
## generation.hydro.pumped.storage.consumption
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 68.0
## Mean : 475.6
## 3rd Qu.: 616.0
## Max. :4523.0
## NA's :19
## generation.hydro.run.of.river.and.poundage generation.hydro.water.reservoir
## Min. : 0.0 Min. : 0
## 1st Qu.: 637.0 1st Qu.:1077
## Median : 906.0 Median :2164
## Mean : 972.1 Mean :2605
## 3rd Qu.:1250.0 3rd Qu.:3757
## Max. :2000.0 Max. :9728
## NA's :19 NA's :18
## generation.marine generation.nuclear generation.other
## Min. :0 Min. : 0 Min. : 0.00
## 1st Qu.:0 1st Qu.:5760 1st Qu.: 53.00
## Median :0 Median :6566 Median : 57.00
## Mean :0 Mean :6264 Mean : 60.23
## 3rd Qu.:0 3rd Qu.:7025 3rd Qu.: 80.00
## Max. :0 Max. :7117 Max. :106.00
## NA's :19 NA's :17 NA's :18
## generation.other.renewable generation.solar generation.waste
## Min. : 0.00 Min. : 0 Min. : 0.0
## 1st Qu.: 73.00 1st Qu.: 71 1st Qu.:240.0
## Median : 88.00 Median : 616 Median :279.0
## Mean : 85.64 Mean :1433 Mean :269.5
## 3rd Qu.: 97.00 3rd Qu.:2578 3rd Qu.:310.0
## Max. :119.00 Max. :5792 Max. :357.0
## NA's :18 NA's :18 NA's :19
## generation.wind.offshore generation.wind.onshore forecast.solar.day.ahead
## Min. :0 Min. : 0 Min. : 0
## 1st Qu.:0 1st Qu.: 2933 1st Qu.: 69
## Median :0 Median : 4849 Median : 576
## Mean :0 Mean : 5464 Mean :1439
## 3rd Qu.:0 3rd Qu.: 7398 3rd Qu.:2636
## Max. :0 Max. :17436 Max. :5836
## NA's :18 NA's :18
## forecast.wind.offshore.eday.ahead forecast.wind.onshore.day.ahead
## Mode:logical Min. : 237
## NA's:35064 1st Qu.: 2979
## Median : 4855
## Mean : 5471
## 3rd Qu.: 7353
## Max. :17430
##
## total.load.forecast total.load.actual price.day.ahead price.actual
## Min. :18105 Min. :18041 Min. : 2.06 Min. : 9.33
## 1st Qu.:24794 1st Qu.:24808 1st Qu.: 41.49 1st Qu.: 49.35
## Median :28906 Median :28901 Median : 50.52 Median : 58.02
## Mean :28712 Mean :28697 Mean : 49.87 Mean : 57.88
## 3rd Qu.:32263 3rd Qu.:32192 3rd Qu.: 60.53 3rd Qu.: 68.01
## Max. :41390 Max. :41015 Max. :101.99 Max. :116.80
## NA's :36
#describe(df)
There are eight columns that contain no values (all NAs or zeroes). These are removed here.
df_clean <- df %>%
select(-c(generation.fossil.coal.derived.gas,
generation.fossil.oil.shale,
generation.fossil.peat,
generation.geothermal,
generation.marine,
generation.hydro.pumped.storage.aggregated,
generation.wind.offshore,
forecast.wind.offshore.eday.ahead))
colSums(is.na(df_clean)) # Number of missing values for each remaining variable
## time
## 0
## generation.biomass
## 19
## generation.fossil.brown.coal.lignite
## 18
## generation.fossil.gas
## 18
## generation.fossil.hard.coal
## 18
## generation.fossil.oil
## 19
## generation.hydro.pumped.storage.consumption
## 19
## generation.hydro.run.of.river.and.poundage
## 19
## generation.hydro.water.reservoir
## 18
## generation.nuclear
## 17
## generation.other
## 18
## generation.other.renewable
## 18
## generation.solar
## 18
## generation.waste
## 19
## generation.wind.onshore
## 18
## forecast.solar.day.ahead
## 0
## forecast.wind.onshore.day.ahead
## 0
## total.load.forecast
## 0
## total.load.actual
## 36
## price.day.ahead
## 0
## price.actual
## 0
16 out of 22 of the remaining variables each have a maximum of 36 missing values. Looking at where these occur, we see that most missing values occur across variables. Specifically, 90% of the missing values occur over only 18 rows. The remaining 10% are spread across 28 rows.
# Rows where data is missing
df_clean[!complete.cases(df_clean),]
# Number of NAs per row
missing_per_row <- rowSums(is.na(df_clean[!complete.cases(df_clean),]))
missing_per_row
## 100 109 110 111 112 113 114 452 453 644 662 752 753
## 14 15 15 15 15 15 15 14 14 14 15 1 1
## 754 757 758 759 760 761 762 763 764 2259 2529 2624 2709
## 1 1 1 1 1 1 1 1 1 1 15 1 15
## 2914 3555 3969 6584 6587 8050 11237 11525 11527 11903 12673 13342 13392
## 1 1 14 1 15 15 1 1 1 1 1 14 1
## 15273 15983 16613 25165 25172 30186 30897
## 1 1 1 1 1 1 15
sum(missing_per_row > 1)
## [1] 18
sum(missing_per_row[missing_per_row > 1])/sum(missing_per_row)
## [1] 0.9041096
We can visualise where in the series the missing values occur using the following function:
ggplot_na_distribution(df_clean$generation.biomass)
However, this only allows us to look at one variable at a time. There are other functions in the imputeTS package which can be useful for investigating missingness in a (univariate) time series.
We have decided to interpolate the missing values using linear interpolation. The only places where this might the case are two gaps of length 6 and 8. These will be visualised below.
# Interpolate missing values using linear interpolation
df_final <- df_clean %>% mutate(across(generation.biomass:price.actual, .fns = na_interpolation, option = 'linear'))
# Verifying missing values have been filled
df_final[!complete.cases(df_final),]
Plotting larger gaps where interpolation has occurred.
ggplot(data = df_final[0:120,]) + geom_line(aes(x=time, y = generation.biomass))
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
What are we left with in df_final?
35,000 obs across 20 variables, hourly from start 2015 to end 2018:
Energy generation from 14 different sources
Forecast onshore wind and solar, one day ahead
Forecast and actual total electricity load
Forecast and actual electricity price
And in the other dataset, not yet loaded: hourly weather data over the same period for a number of Spanish cities
# Formatting time variable to datetime object
# NB: Time contains both UTC+1 and UTC+2 time-zones - date functions automatically pick this up and convert to UTC
df_final$time <- as_datetime(df_final$time)
head(df_final)
gen_data <- df_final %>%
select(c(1, starts_with("generation")))
# Giving vars simpler names
names(gen_data) <- c("time",
"Biomass",
"Lignite",
"Gas",
"Hard Coal",
"Oil",
"Hydro Pumped",
"Hydro River",
"Hydro Reservoir",
"Nuclear",
"Other",
"Other Renewable",
"Solar",
"Waste",
"Wind Onshore")
# Converting to long form
long_gen_data <- gen_data %>%
pivot_longer(cols = -time, names_to = "generation_source")
head(long_gen_data)
Getting statistics over entire dataset for each source
overall_stats <- long_gen_data %>%
group_by(generation_source) %>%
summarise(sum = sum(value, na.rm = TRUE),
mean = mean(value, na.rm = TRUE),
sd = sd(value, na.rm = TRUE),
max = max(value, na.rm = TRUE)) %>%
mutate(generation_source= fct_reorder(generation_source, desc(sum)))
sources_by_sum <- arrange(overall_stats, desc(sum))
sources_by_sum
ggplot(data = overall_stats) + geom_col(aes(x = generation_source, y = sum)) + theme(axis.text.x = element_text(angle=90))
sources_by_sum %>% pivot_longer(cols = c(sum, mean, sd, max), names_to = "stat",
values_to = "value") %>%
filter(stat != "sum") %>%
ggplot() + geom_col(aes(x = generation_source, y = value, fill = stat), position = "dodge") + theme(axis.text.x = element_text(angle=90))
Using this information to re-factor the generation_source variable, so that plots will order the sources from biggest to smallest
long_gen_data$generation_source <- factor(long_gen_data$generation_source, levels = sources_by_sum$generation_source)
Smoothing the data on a daily scale
gen_data_smoothed <- long_gen_data %>%
group_by(generation_source) %>%
summarise_by_time(time, .by = 'day', adjusted = mean(value))
head(gen_data_smoothed)
Looking at four series over time in isolation, with quarterly smoothing (This function is nice as it includes an interactive feature)
gen_data_smoothed %>%
filter(generation_source %in% sources_by_sum$generation_source[1:6]) %>%
plot_time_series(time, adjusted, .facet_ncol = 3,
.smooth_period = "3 months",
.title= "Top generation sources over time, with 3-month mean smoothing")
long_gen_data %>%
group_by(generation_source) %>%
summarise_by_time(time, .by = '3 months', adjusted = mean(value)) %>%
ggplot() +
geom_area(aes(x = time, y = adjusted, fill = generation_source), stat = "identity")
# Proportions
df_props <- cbind(df_final[1], prop.table(as.matrix(df_final[-1]), margin = 1))
long_gen_data_props <- df_props %>%
pivot_longer(cols = -time, names_to = "generation_source") %>%
mutate(generation_source = str_replace(generation_source, "generation.", ""))
long_gen_data_props %>%
group_by(generation_source) %>%
summarise_by_time(time, .by = '3 months', adjusted = mean(value)) %>%
ggplot() +
geom_area(aes(x = time, y = adjusted, fill = generation_source), stat = "identity") +
theme_minimal()
# Correlation plot. Sample size for computational efficiency.
df_corr <- gen_data[1:10000,2:ncol(gen_data)]
df_corr %>% cor() %>%
ggcorrplot(.,
hc.order = TRUE,
type = "lower",
lab = FALSE)
# Create time variables
long_gen_data$year <- year(long_gen_data$time)
long_gen_data$month <- month(long_gen_data$time)
long_gen_data$day <- day(long_gen_data$time)
long_gen_data$hour <- hour(long_gen_data$time)
long_gen_data_props$year <- year(long_gen_data_props$time)
long_gen_data_props$month <- month(long_gen_data_props$time)
long_gen_data_props$day <- day(long_gen_data_props$time)
long_gen_data_props$hour <- hour(long_gen_data_props$time)
Seasonality within a day
hourly_generation_plot_by_year <- long_gen_data %>%
ggplot(aes(x = hour, y = value, fill = generation_source)) +
geom_bar(stat = "identity")
hourly_generation_plot_by_year
# Proportions
hourly_generation_plot_by_year_props <- long_gen_data_props %>%
ggplot(aes(x = hour, y = value, fill = generation_source)) +
geom_bar(stat = "identity")
hourly_generation_plot_by_year_props
Seasonality within a month
daily_generation_plot_by_year <- long_gen_data %>%
ggplot(aes(x = day, y = value, fill = generation_source)) +
geom_bar(stat = "identity") +
facet_wrap(~year) + theme(legend.position = "none")
daily_generation_plot_by_year
# Note: Similar problem to raw values
daily_generation_plot_by_year_props <- long_gen_data_props %>%
ggplot(aes(x = day, y = value, fill = generation_source)) +
geom_bar(stat = "identity") +
facet_wrap(~year) + theme(legend.position = "none")
daily_generation_plot_by_year_props
Seasonality within a year
monthly_generation_plot <- long_gen_data %>%
ggplot(aes(x = month, y = value, fill = generation_source)) +
geom_bar(stat = "identity") +
facet_wrap(~year)
monthly_generation_plot
# Note: Shorter bars for shorter months as above
monthly_generation_plot_props <- long_gen_data_props %>%
ggplot(aes(x = month, y = value, fill = generation_source)) +
geom_bar(stat = "identity") +
facet_wrap(~year)
monthly_generation_plot_props